
%%%------------------------------------
%%% @author  eric.huang
%%% @Description: 敏感词检测
%%% @Created : 1-Apr-2015
%%%------------------------------------
-module(mod_word).

-export([
        load_db_sensitive_word/0,
        word_is_sensitive_talk/1,
        replace_sensitive_talk/1
    ]).

-define(ETS_SENSITIVE_TALK, ets_sensitive_talk).
-define(FETCH_LIMIT, 1000).

%%
%% API Functions
%%

%% Load (or reload) the chat sensitive-word filter from the database.
%%
%% On the first call the named, public ETS table is created; ets:new/2 makes
%% the calling process its owner. On later calls the existing table is emptied
%% before the reload, so lookups made while the import is still running see a
%% partially filled table.
%%
%% Always returns `ok' once import_words_talk/2 has returned.
load_db_sensitive_word() ->
    Table = ?ETS_SENSITIVE_TALK,
    case ets:info(Table) =:= undefined of
        true ->
            ets:new(Table, [named_table, public, set, {read_concurrency, true}]);
        false ->
            ets:delete_all_objects(Table)
    end,
    import_words_talk(Table, 0),
    ok.

%% Page through the `sensitive_word' table, ?FETCH_LIMIT rows at a time
%% starting at Offset, and add every fetched word to the ETS table EtsName.
%%
%% - An empty page ends the import and returns `ok'.
%% - A query error is logged and the same page is retried after one second;
%%   there is no retry limit, so this loops for as long as the query fails.
%%
%% NOTE(review): the query pages with LIMIT but has no ORDER BY, so page
%% contents rely on the database returning rows in a stable order - confirm.
import_words_talk(EtsName, Offset) ->
    Sql = io_lib:format(<<"SELECT word FROM sensitive_word LIMIT ~p,~p">>, [Offset, ?FETCH_LIMIT]),
    case lib_mysql:do_query(Sql, "load_db_sensitive_word") of
        {ok, []} ->
            ok;
        {ok, Terms} ->
            %% Render every fetched term as text first, then store the whole page.
            Words = [unicode:characters_to_list(io_lib:format("~ts", [Term])) || Term <- Terms],
            lists:foreach(fun(Word) -> add_word_to_ets(Word, EtsName) end, Words),
            import_words_talk(EtsName, Offset + ?FETCH_LIMIT);
        {error, Err} ->
            lager:error("import_words_talk error:~p", [Err]),
            timer:sleep(1000),
            import_words_talk(EtsName, Offset)
    end.

%% Add one word to the ETS index EtsName.
%%
%% The table maps the first code point of a word to the list of all known
%% words starting with that code point (most recently added first).
%%
%% Returns `ignor' when the word is empty or already stored, otherwise the
%% result of ets:insert/2 (`true').
add_word_to_ets(Word, EtsName) ->
    case unicode:characters_to_list(Word, unicode) of
        [] ->
            ignor;
        UniString ->
            [FirstChar | _] = UniString,
            Known =
                case ets:lookup(EtsName, FirstChar) of
                    [] -> [];
                    [{_Key, Stored}] -> Stored
                end,
            case lists:member(UniString, Known) of
                true -> ignor;
                false -> ets:insert(EtsName, {FirstChar, [UniString | Known]})
            end
    end.

%% Tell whether the given text contains a sensitive word.
%%
%% Utf8Binary is decoded with unicode:characters_to_list/2 and scanned against
%% the ?ETS_SENSITIVE_TALK table. The empty list short-circuits to `false'.
%% Input that cannot be decoded produces an error tuple instead of a list;
%% it is handed to the kernel unchanged, which only matches lists.
word_is_sensitive_talk(Utf8Binary) ->
    case Utf8Binary of
        [] ->
            false;
        _ ->
            Decoded = unicode:characters_to_list(Utf8Binary, unicode),
            word_is_sensitive_kernel(Decoded, ?ETS_SENSITIVE_TALK)
    end.

%% Scan UniString (a list of code points) for any word stored in EtsName.
%%
%% At every position the words indexed under the current head character are
%% tested as prefixes of the remaining input. Returns `true' on the first
%% hit and `false' once the input is exhausted.
%%
%% lists:prefix/2 covers the "word longer than input", "same length" and
%% "word shorter than input" cases in one call, so the remaining input no
%% longer has to be measured with length/1 at every position.
word_is_sensitive_kernel([], _EtsName) ->
    false;
word_is_sensitive_kernel([HeadChar | TailString] = UniString, EtsName) ->
    WordList = get_key_char_wordlist(HeadChar, EtsName),
    StartsHere = fun(Word) -> lists:prefix(Word, UniString) end,
    %% `orelse' keeps the recursive call in tail position.
    lists:any(StartsHere, WordList) orelse word_is_sensitive_kernel(TailString, EtsName).

%% Replace every sensitive word in a chat string with mask characters.
%%
%% - `<<>>' and `undefined' are returned unchanged.
%% - A binary is decoded as UTF-8, masked against ?ETS_SENSITIVE_TALK and
%%   re-encoded; a binary that is not valid/complete UTF-8 yields `<<>>'.
%% - A list is first encoded to a UTF-8 binary and then handled like a binary,
%%   so the result is always a binary. A list that cannot be encoded (for
%%   example one containing an invalid code point) yields `<<>>', the same as
%%   an undecodable binary, instead of leaking the error tuple returned by
%%   unicode:characters_to_binary/1 to the caller.
%% - Any other term is returned unchanged.
replace_sensitive_talk(<<>>) ->
    <<>>;
replace_sensitive_talk(undefined) ->
    undefined;
replace_sensitive_talk(Utf8String) when is_binary(Utf8String) ->
    UniString =
        case unicode:characters_to_list(Utf8String, unicode) of
            {error, _, _RestData} -> [];
            {incomplete, _, _} -> [];
            Data -> Data
        end,
    ReplacedString = replace_sensitive_kernel(UniString, [], ?ETS_SENSITIVE_TALK),
    unicode:characters_to_binary(ReplacedString, utf8);
replace_sensitive_talk(InputString) when is_list(InputString) ->
    case unicode:characters_to_binary(InputString) of
        Utf8Binary when is_binary(Utf8Binary) ->
            replace_sensitive_talk(Utf8Binary);
        _EncodingFailure ->
            %% {error, _, _} | {incomplete, _, _}: treat like an undecodable binary.
            <<>>
    end;
replace_sensitive_talk(InputString) ->
    InputString.

%% Fold step used while searching for a sensitive word at the start of
%% InputString.
%%
%% Last is the fold accumulator: the length of the word matched so far, or 0
%% when nothing has matched yet. Once it is non-zero it is passed through
%% untouched, so the first matching word in the folded list wins.
%% InputStrLen is the caller-supplied length of InputString.
%%
%% Returns the length of Word when Word is a prefix of InputString, else 0.
match_of_replace_sensitive_kernel(Word, 0, InputString, InputStrLen) ->
    WordLen = length(Word),
    IsPrefix =
        if
            WordLen > InputStrLen -> false;
            WordLen =:= InputStrLen -> InputString =:= Word;
            true -> lists:sublist(InputString, WordLen) =:= Word
        end,
    case IsPrefix of
        true -> WordLen;
        false -> 0
    end;
match_of_replace_sensitive_kernel(_Word, Last, _InputString, _InputStrLen) ->
    Last.

%% Entry point of the masking scan.
%%
%% Returns LastReplaced as-is when no input is left; otherwise hands the
%% remaining input to private_replace_sensitive_kernel/3.
replace_sensitive_kernel(InputString, LastReplaced, EtsName) ->
    case InputString of
        [] -> LastReplaced;
        _ -> private_replace_sensitive_kernel(InputString, LastReplaced, EtsName)
    end.


%% Detect blocked words in InputString and replace them with mask characters.
%%
%% InputString  - list of code points still to be scanned.
%% LastReplaced - output produced so far; the masked form of InputString is
%%                appended to it.
%% EtsName      - ETS table mapping a head character to its word list.
%%
%% Returns LastReplaced followed by InputString in which every matched word is
%% replaced by make_sensitive_show_string/1 of the same length. At each
%% position the first matching word of the head character's word list wins.
%%
%% The output is built in reverse and flipped once at the end, and the length
%% of the remaining input is tracked by subtraction, so the scan avoids the
%% repeated `Acc ++ [X]' copies and length/1 calls per character.
private_replace_sensitive_kernel(InputString, LastReplaced, EtsName) ->
    RevMasked = mask_sensitive_rev(InputString, length(InputString), [], EtsName),
    LastReplaced ++ lists:reverse(RevMasked).

%% Worker loop: RevAcc holds the masked output in reverse order and
%% InputStrLen is always the length of the input still to be scanned.
mask_sensitive_rev([], _InputStrLen, RevAcc, _EtsName) ->
    RevAcc;
mask_sensitive_rev([HeadChar | TailString] = InputString, InputStrLen, RevAcc, EtsName) ->
    WordList = get_key_char_wordlist(HeadChar, EtsName),
    Match = fun(Word, Last) ->
        match_of_replace_sensitive_kernel(Word, Last, InputString, InputStrLen)
    end,
    case lists:foldl(Match, 0, WordList) of
        0 ->
            %% No word starts here: keep the character and move on.
            mask_sensitive_rev(TailString, InputStrLen - 1, [HeadChar | RevAcc], EtsName);
        SensWordLen ->
            %% Skip the matched word and emit a mask of the same length.
            LeftString = lists:nthtail(SensWordLen, InputString),
            Mask = make_sensitive_show_string(SensWordLen),
            mask_sensitive_rev(LeftString, InputStrLen - SensWordLen,
                               lists:reverse(Mask, RevAcc), EtsName)
    end.

%% Fetch the list of words stored under KeyChar in the ETS table EtsName.
%% Returns [] when the table has no entry for that key.
get_key_char_wordlist(KeyChar, EtsName) ->
    Rows = ets:lookup(EtsName, KeyChar),
    case Rows of
        [{_Key, Words}] -> Words;
        [] -> []
    end.

%% Build the mask shown in place of a sensitive word of N characters.
%%
%% The mask is the 8-character pattern "*&^%$#@!" repeated as often as it
%% fits, followed by the first `N rem 8' characters of the pattern, so the
%% result is always exactly N characters long.
%%
%% N = 0 yields "". This also makes every multiple of 8 (16, 24, ...)
%% terminate: with a zero remainder nothing more is appended.
make_sensitive_show_string(N) when is_integer(N), N >= 0 ->
    Pattern = "*&^%$#@!",
    PatternLen = length(Pattern),
    FullRepeats = lists:append(lists:duplicate(N div PatternLen, Pattern)),
    FullRepeats ++ lists:sublist(Pattern, N rem PatternLen).


